Influenza Vaccination Impact on DNA Methylation and Gene Regulation in UGA6 Cohort

EWAS and Immune Response Related CpGs

Author

Jiachen Ai

Load Libraries

Code
library(readr)
library(tidyverse)
library(data.table)
library(ggplot2)
library(dplyr)
library(gtsummary)
library(tidymodels)
library(tidytext)
library(embed)
library(knitr)
library(Seurat)
library(tibble)
library(caret)
library(mixOmics)
library(GGally)
library(glmnet)
library(parsnip)
library(corrplot)
library(recipes)
library(caret)
library(ggpmisc)
library(boot)
library(ggcorrplot)
library(factoextra)
library(stringr)
library(RColorBrewer)
library(combinat)
library(broom)
library(ggpubr)
library(gridExtra)
library(biomaRt)
library(MASS)
library(limma)
library(patchwork)
library(purrr)
# display.brewer.all()

Data Preparation

Study cohort

study_cohort <- readLines("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/samples_to_keep.txt")

cohort_info <- read_csv("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/Traits_HumanBlood256_Reed2023-9206_021624mjt_HP_updated.csv") %>% filter(sampleID %in% study_cohort) %>%
  mutate(cohort = if_else(UGA45 == FALSE, "UGA6", "UGA45")) %>%
  dplyr::select(-UGA45) 

Methylation Profiles

MethMatrix <- read_csv("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/MethMatrixCG-CIVR-UGA6_256_20-100.csv") |>
  filter(sampleID %in% study_cohort) 
  
# Remove the first column (sampleID)
MethMatrix_1 <- MethMatrix[,-1]

# check the distribution of ramdomly selected CpG sites 
MethMatrix_1 |> 
  t() |>
  as.data.frame() |>
  sample_n(12) |> 
  t() |>
  as.data.frame() |>
  pivot_longer(everything(), names_to = "sampleID", values_to = "methylation") |>
  ggplot(aes(x = methylation)) +
  geom_histogram(bins = 30) +
  facet_wrap(~sampleID, scales = "fixed") +
  theme_minimal()

Annotations

# install.packages("BiocManager")
# BiocManager::install(c("biomaRt", "GenomicRanges"))
suppressPackageStartupMessages({
  library(biomaRt)
  library(GenomicRanges)
  library(dplyr)
  library(readr)
})

# helper to get an Ensembl mart for chosen build
get_mart <- function(build = c("GRCh38","GRCh37")) {
  build <- match.arg(build)
  if (build == "GRCh37") {
    useEnsembl(biomart = "genes",
               dataset = "hsapiens_gene_ensembl",
               GRCh = 37)
  } else {
    useEnsembl(biomart = "genes",
               dataset = "hsapiens_gene_ensembl",
               GRCh = 38)
  }
}

# fetch (or load cached) gene coordinates from Ensembl
load_ensembl_genes <- function(build = c("GRCh38","GRCh37"),
                               cache_file = NULL) {
  build <- match.arg(build)
  if (is.null(cache_file)) cache_file <- paste0("ensembl_genes_", build, ".rds")

  if (file.exists(cache_file)) {
    return(readRDS(cache_file))
  }

  mart <- get_mart(build)
  # pull basic gene-level coordinates
  genes <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol",
                   "chromosome_name", "start_position",
                   "end_position", "strand"),
    filters = "chromosome_name",
    values  = c(1:22, "X", "Y"),
    mart    = mart
  ) |>
    mutate(
      chromosome = paste0("chr", chromosome_name),
      tss = ifelse(strand == 1, start_position, end_position)
    ) |>
    dplyr::select(ensembl_gene_id, hgnc_symbol, chromosome,
           start_position, end_position, strand, tss)

  saveRDS(genes, cache_file)
  genes
}

# main: annotate CpGs
annotate_cpgs_biomart <- function(cpg_ids,
                                  build = c("GRCh38","GRCh37"),
                                  promoter_up = 1500,
                                  promoter_down = 250,
                                  gene_cache = NULL) {
  build <- match.arg(build)

  # 1) Parse CpGs -> GRanges (keep standard chromosomes only)
  chr <- sub("^(chr[^_]+)_.*$", "\\1", cpg_ids)
  pos <- as.integer(sub("^chr[^_]+_(\\d+)$", "\\1", cpg_ids))
  keep <- chr %in% paste0("chr", c(1:22, "X", "Y"))
  if (!all(keep)) message(sprintf("Dropping %d CpGs on non-standard chromosomes/contigs.", sum(!keep)))
  cpg_ids <- cpg_ids[keep]; chr <- chr[keep]; pos <- pos[keep]
  cpg_gr  <- GRanges(seqnames = chr, ranges = IRanges(pos, pos))

  # 2) Get Ensembl gene coordinates
  genes <- load_ensembl_genes(build, cache_file = gene_cache)

  # 3) GRanges for gene bodies, promoters, TSS
  gene_gr <- GRanges(
    seqnames = genes$chromosome,
    ranges   = IRanges(genes$start_position, genes$end_position),
    strand   = ifelse(genes$strand == 1, "+", "-"),
    gene_id  = genes$ensembl_gene_id,
    symbol   = genes$hgnc_symbol,
    tss      = genes$tss
  )

  prom_start <- ifelse(strand(gene_gr) == "+", genes$tss - promoter_up,   genes$tss - promoter_down)
  prom_end   <- ifelse(strand(gene_gr) == "+", genes$tss + promoter_down, genes$tss + promoter_up)

  promoter_gr <- GRanges(
    seqnames = seqnames(gene_gr),
    ranges   = IRanges(start = pmax(1L, prom_start), end = pmax(prom_end, prom_start)),
    strand   = strand(gene_gr),
    gene_id  = mcols(gene_gr)$gene_id,
    symbol   = mcols(gene_gr)$symbol
  )

  tss_gr <- GRanges(
    seqnames = seqnames(gene_gr),
    ranges   = IRanges(start = genes$tss, end = genes$tss),
    strand   = strand(gene_gr),
    gene_id  = mcols(gene_gr)$gene_id,
    symbol   = mcols(gene_gr)$symbol
  )

  # 4) Overlaps
  ovl_prom <- findOverlaps(cpg_gr, promoter_gr, ignore.strand = TRUE)
  ovl_gene <- findOverlaps(cpg_gr, gene_gr,     ignore.strand = TRUE)

  pick_first <- function(hits, qlen) {
    out <- rep(NA_integer_, qlen)
    if (length(hits)) out[queryHits(hits)] <- subjectHits(hits)
    out
  }
  idx_prom <- pick_first(ovl_prom, length(cpg_gr))
  idx_gene <- pick_first(ovl_gene, length(cpg_gr))
  idx_tss  <- nearest(cpg_gr, tss_gr, select = "arbitrary", ignore.strand = TRUE)

  region <- ifelse(!is.na(idx_prom), "Promoter",
                   ifelse(!is.na(idx_gene), "GeneBody", "Intergenic"))

  # picked gene (promoter > gene body > nearest)
  pick_symbol <- function(i_prom, i_gene, i_tss) {
    sym <- rep(NA_character_, length(i_tss))
    sym[!is.na(i_prom)] <- mcols(promoter_gr)$symbol[i_prom[!is.na(i_prom)]]
    need <- is.na(sym) & !is.na(i_gene); sym[need] <- mcols(gene_gr)$symbol[i_gene[need]]
    need <- is.na(sym) & !is.na(i_tss);  sym[need] <- mcols(tss_gr)$symbol[i_tss[need]]
    sym
  }
  pick_geneid <- function(i_prom, i_gene, i_tss) {
    gid <- rep(NA_character_, length(i_tss))
    gid[!is.na(i_prom)] <- mcols(promoter_gr)$gene_id[i_prom[!is.na(i_prom)]]
    need <- is.na(gid) & !is.na(i_gene); gid[need] <- mcols(gene_gr)$gene_id[i_gene[need]]
    need <- is.na(gid) & !is.na(i_tss);  gid[need] <- mcols(tss_gr)$gene_id[i_tss[need]]
    gid
  }

  gene_symbol <- pick_symbol(idx_prom, idx_gene, idx_tss)
  gene_id     <- pick_geneid(idx_prom, idx_gene, idx_tss)

  # explicit promoter gene symbol column (only when Region == "Promoter")
  promoter_gene <- rep(NA_character_, length(cpg_gr))
  is_prom <- !is.na(idx_prom)
  promoter_gene[is_prom] <- mcols(promoter_gr)$symbol[idx_prom[is_prom]]

  # distance to nearest TSS
  dist_to_tss <- rep(NA_integer_, length(cpg_gr))
  okn <- !is.na(idx_tss)
  if (any(okn)) dist_to_tss[okn] <- GenomicRanges::distance(cpg_gr[okn], tss_gr[idx_tss[okn]])

  tibble::tibble(
    CpG_ID        = cpg_ids,
    Chromosome    = as.character(GenomeInfoDb::seqnames(cpg_gr)),
    Position      = GenomicRanges::start(cpg_gr),
    Region        = region,                 # Promoter / GeneBody / Intergenic
    Ensembl_ID    = gene_id,
    Gene_symbol   = gene_symbol,            # rule: promoter > gene body > nearest
    Promoter_gene = promoter_gene,          # NEW: only filled for promoters
    Nearest_gene  = ifelse(!is.na(idx_tss), mcols(tss_gr)$symbol[idx_tss], NA_character_),
    Dist_to_TSSbp = dist_to_tss
  )
}
# CpGs (e.g., from MethMatrix)
cpgs <- colnames(MethMatrix)[-1]   # "chr10_100371240", ...
ann <- annotate_cpgs_biomart(cpgs, build = "GRCh38")     # or "GRCh37"

# Save as .txt file
# write.table(ann, "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/cpg_annotation.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Read the annotation file
cpg_annotation_df <- 
  read.table("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/cpg_annotation.txt", header = TRUE, sep = "\t") |>
  transmute(
    CpG         = CpG_ID,
    Ensembl_ID  = Ensembl_ID,
    Gene = case_when(
      Region == "Promoter"  ~ coalesce(Promoter_gene, Gene_symbol), # fallback if missing
      Region == "GeneBody"  ~ Gene_symbol,
      Region == "Intergenic"~ Nearest_gene,
      TRUE                  ~ NA_character_
    )
  ) 

Traits

Background

Four vaccination strains:

  • Vic19 (A/Victoria/2570/2019) - H1N1
  • Tas20 (A/Tasmania/503/2020) - H3N2
  • Phu13 (B/Phuket/3073/2013) - B-Victoria
  • Wa19 (B/Washington/2/2019) - B-Yamagata

To enhance analysis of immune response, I derived three new predictors from the raw hemagglutination inhibition (HAI) titre values across four vaccine strains:

  • The first predictor, log_d0_sp_sum, represents the natural logarithmic sum of HAI titre values at day 0 for each strain, indicating baseline seroprotection levels.

\[ \text{log\_d0\_sp\_sum} = \ln(\text{HAI}_{\text{Vic19, d0}} + \text{HAI}_{\text{Tas20, d0}} + \text{HAI}_{\text{Phu13, d0}} + \text{HAI}_{\text{Wa19, d0}}) \]

  • Similarly, log_d28_sp_sum is the natural logarithmic sum of HAI titre values at day 28, reflecting post-vaccination seroprotection levels.

\[ \text{log\_d28\_sp\_sum} = \ln(\text{HAI}_{\text{Vic19, d28}} + \text{HAI}_{\text{Tas20, d28}} + \text{HAI}_{\text{Phu13, d28}} + \text{HAI}_{\text{Wa19, d28}}) \]

  • The third predictor, day28_sc_foldchange, quantifies the fold change in seroprotection between day 0 and day 28, calculated as the ratio of seroprotection levels at day 28 to those at day 0.

\[ \text{d28\_sc\_foldchange} = \frac{\text{HAI}_{\text{Vic19, d28}} + \text{HAI}_{\text{Tas20, d28}} + \text{HAI}_{\text{Phu13, d28}} + \text{HAI}_{\text{Wa19, d28}}}{\text{HAI}_{\text{Vic19, d0}} + \text{HAI}_{\text{Tas20, d0}} + \text{HAI}_{\text{Phu13, d0}} + \text{HAI}_{\text{Wa19, d0}}} \]

traits <- read_csv(
  "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/Traits_HumanBlood256_Reed2023-9206_021624mjt_HP(revised).csv") |>
  filter(sampleID %in% study_cohort) 

# calculate the sum of seroprotection titers at day 0 and day 28
traits <- traits |>
  mutate(d0_sp_sum = d0_titre_vic19 + d0_titre_tas20 + d0_titre_phu13 + d0_titre_wa19) |>
  mutate(d28_sp_sum = d28_titre_vic19 + d28_titre_tas20 + d28_titre_phu13 + d28_titre_wa19) |>
  mutate(log_d0_sp_sum = log(d0_titre_vic19 + d0_titre_tas20 + d0_titre_phu13 + d0_titre_wa19)) |>
  mutate(log_d28_sp_sum = log(d28_titre_vic19 + d28_titre_tas20 + d28_titre_phu13 + d28_titre_wa19)) |>
  mutate(d28_sc_foldchange =  d28_sp_sum / d0_sp_sum) |>
  mutate(log_d28_sc_foldchange = log(d28_sc_foldchange)) 

# Read deconvoluted cell types originated from methylation data by CellFi
celltype_methylation <- read_delim(
  "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/cell_type_specific_methy.txt",
  delim = "\t",
  col_names = TRUE) |> 
  as_tibble() |>
  dplyr::select(-Residual) |>
  dplyr::rename(sampleID = `Samples`) |>
  # change the first _ of column `Samples` to -
  dplyr::mutate(sampleID = sub("_", "-", sampleID)) |>
  filter(sampleID %in% study_cohort) 

# Run PCA on Cell type proportion
celltype_methylation_pca <- celltype_methylation |> 
  dplyr::select(-sampleID) |> 
  prcomp(center = TRUE,
         scale = FALSE) |>
  # extract the first principal component because it explains 60% of the variance
  predict(celltype_methylation) |>
  as_tibble() |>
  dplyr::select(PC1) |>
  mutate(sampleID = celltype_methylation$sampleID) |>
  rename_with(~str_replace(., "PC", "Cell_Type_PC"), everything())

traits <- traits |> 
  left_join(celltype_methylation_pca, by = "sampleID")

# small the traits to only the columns we need
traits <- traits |> 
  dplyr::select("sampleID",
                "Cell_Type_PC1",
                "log_d0_sp_sum",
                "log_d28_sp_sum",
                "d28_sc_foldchange",
                "seroresponse_3grp",
                "seroresponse_2grp",
                "age",
                "bmi",
                "sex",
                "ancestry",
                # "plate",
                "vaccine",
                "plate"
                )

Imputation

Impute missing values in traits

# Check missing values of the data for traits
colSums(is.na(traits)) |> 
  # pick out the columns with missing values
  as.data.frame() |>
  filter(colSums(is.na(traits)) > 0)
    colSums(is.na(traits))
bmi                      4
# Check missing values of the data for MethMatrix
colSums(is.na(MethMatrix)) |>
  as.data.frame() |>
  filter(colSums(is.na(MethMatrix)) > 0)
[1] colSums(is.na(MethMatrix))
<0 rows> (or 0-length row.names)
# impute missing values with median
traits <- recipe(bmi ~ ., data = traits) |> 
  step_impute_median(bmi) |>
  prep() |>
  bake(new_data = NULL)

Plots

Plot the cell type proportions distribution

# Use Brewer's "Blues" palette with 6 values
blue_palette <- brewer.pal(n = 9, name = "Blues")[4:10]

# Prepare data with ordered factor levels by median
plot_data <- celltype_methylation %>%
  dplyr::select(-sampleID) %>%
  pivot_longer(cols = everything(), names_to = "Variables", values_to = "value") %>%
  group_by(Variables) %>%
  mutate(mean_value = mean(value, na.rm = TRUE)) %>%  # or use mean(value) for mean-based order
  ungroup() %>%
  mutate(Variables = factor(Variables, levels = unique(Variables[order(mean_value)])))


p <- ggplot(plot_data, aes(x = Variables, y = value * 100, fill = Variables)) +
  gghalves::geom_half_violin(
    side = "l",  # left side
    trim = FALSE,
    color = NA,
    width = 0.9
  ) +
  gghalves::geom_half_boxplot(
    side = "r",  # right side
    width = 0.2,
    fill = "white",
    outlier.size = 0.8,
    alpha = 0.7
  ) +
  scale_fill_manual(values = blue_palette) +
  labs(x = "", y = "Proportion (%)") +
  scale_x_discrete(expand = c(0.01, 0)) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.line = element_line(size = 0.5),
    axis.ticks = element_line(size = 0.5)
  )
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
# Save the plot
# ggsave("/Users/jacenai/Desktop/Matteo_Lab/UGA6/final.plots/celltype_violin_boxplot.png", plot = p, width = 6, height = 4, dpi = 300)

p

Correlation of traits before sequential orthogonalization

raw_traits <- traits %>%
  dplyr::select(-sampleID) %>%
  mutate(ancestry = ifelse(ancestry == 0, "white", "non-white")) %>%
  mutate(ancestry = as.factor(ancestry)) %>%
  mutate(sex = as.factor(sex)) %>%
  mutate(sex = ifelse(sex == 0, "male", "female")) %>%
  # mutate(vaccine = as.factor(vaccine)) %>%
  # # mutate(plate = as.factor(plate)) %>%
  mutate(vaccine = ifelse(vaccine == 0, "standard dose", "high dose")) %>%
  dplyr::select(Cell_Type_PC1, log_d28_sp_sum, log_d0_sp_sum, 
                d28_sc_foldchange, age, bmi, sex, ancestry, vaccine)

# plot the upper triangular heatmap to see the correlation between the variables
cor(traits %>% dplyr::select(Cell_Type_PC1, log_d28_sp_sum, log_d0_sp_sum,
                d28_sc_foldchange, age, bmi, sex, ancestry, vaccine)) |>
  corrplot::corrplot(upper = "ellipse", tl.col = "black", tl.srt = 45,
                     type = "upper", diag = F, method = "color")
Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
tl.srt, : "upper" is not a graphical parameter
Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
tl.col, : "upper" is not a graphical parameter
Warning in title(title, ...): "upper" is not a graphical parameter

# use this!
ggpairs(raw_traits,
        title = "Pairwise plot of raw traits") %>% suppressMessages()

Based on EDA, we finally include age, sex, Cell_Type_PC1, bmi, ancestry, vaccine, log_d0_sp_sum, log_d28_sp_sum, and d28_sc_foldchange as predictors in the model.

Sequential Orthogonalization

Problem

In the linear model \(\mathbf{Y} = \beta_0 \mathbf{1}_n + \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\varepsilon}\), where \(\mathbf{Y} \in \mathbb{R}^n\) is the response vector, \(\mathbf{X} = [\mathbf{X}_1,\dots,\mathbf{X}_p] \in \mathbb{R}^{n \times p}\) is the design matrix of predictors, \(\boldsymbol{\beta} \in \mathbb{R}^p\) is the coefficient vector, and \(\boldsymbol{\varepsilon} \in \mathbb{R}^n\) is the error term. Identifiability of the regression coefficients requires that \(\mathbf{X}\) have full column rank, i.e., \(\operatorname{rank}(\mathbf{X}) = p\). When predictors are collinear, \(\mathbf{X}^\top \mathbf{X}\) becomes singular or nearly singular, and the ordinary least squares estimator \(\hat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1}\mathbf{X}^\top \mathbf{Y}\) is either undefined or numerically unstable. In such cases, identifiability of the individual coefficients fails, since whenever \(\operatorname{rank}(\mathbf{X}) < p\), there exist distinct coefficient vectors \(\boldsymbol{\beta}, \boldsymbol{\beta}' \in \mathbb{R}^p\) with \(\boldsymbol{\beta} \neq \boldsymbol{\beta}'\) such that \(\mathbf{X}\boldsymbol{\beta} = \mathbf{X}\boldsymbol{\beta}'\), and hence the fitted values \(\hat{\mathbf{Y}} = \mathbf{X}\boldsymbol{\beta}\) are not uniquely determined by the coefficients. This issue arises frequently in DNA methylation studies. For example, when modeling methylation levels as the response with age and immune response variables as predictors, age is often strongly correlated with both methylation levels (e.g., through epigenetic clocks) and immune response measures. Such correlations induce collinearity in the design matrix, reducing its effective rank and making it difficult to disentangle the effects of immune response variables on methylation from confounding by age.

Proposed Solution

To restore identifiability, we propose a sequential orthogonalization procedure inspired by the Gram–Schmidt process. For each predictor \(\mathbf{X}_j \in \mathbb{R}^n\), define \[ \mathbf{X}_j^\perp = \mathbf{X}_j - \mathcal{P}_{\{\mathbf{X}_1^\perp,\dots,\mathbf{X}_{j-1}^\perp\}}(\mathbf{X}_j), \] where \(\mathcal{P}_{\{\mathbf{X}_1^\perp,\dots,\mathbf{X}_{j-1}^\perp\}}(\mathbf{X}_j)\) denotes the projection of \(\mathbf{X}_j\) onto the span of the previously orthogonalized predictors. By construction, the transformed design matrix \(\mathbf{X}^\perp = \big(\mathbf{X}_1^\perp, \dots, \mathbf{X}_p^\perp\big)\) has orthogonal columns, i.e., \((\mathbf{X}^\perp)^\top \mathbf{X}^\perp\) is diagonal. This ensures full column rank and yields identifiable coefficient estimates that represent the unique contribution of each predictor to the response.

Gram-Schmidt Process

The Gram-Schmidt process is an algorithm used to convert a set of linearly independent vectors into an orthonormal basis. Given a set of vectors \(\mathbf{a}_1, \mathbf{a}_2, \ldots, \mathbf{a}_k \in \mathbb{R}^n\), the Gram-Schmidt process constructs an orthonormal set \(\mathbf{q}_1, \mathbf{q}_2, \ldots, \mathbf{q}_k\) such that:

  • Each vector \(\mathbf{q}_i\) is orthogonal to all preceding vectors \(\mathbf{q}_j\) for \(j < i\).
  • Each vector \(\mathbf{q}_i\) has unit length, i.e., \(|\mathbf{q}_i| = 1\).

This process is fundamental in linear algebra for applications that require orthogonal bases, such as QR decomposition and numerical stability improvements in computational algorithms. The process can be summarized as follows:

Let \(\mathbf{a}_1\) and \(\mathbf{a}_2\) be two linearly independent vectors in \(\mathbb{R}^2\). The Gram-Schmidt process for these vectors proceeds as follows:

  1. Orthogonalize \(\mathbf{a}_1\):
    • Set \(\tilde{\mathbf{q}}_1 = \mathbf{a}_1\).
  2. Normalize \(\tilde{\mathbf{q}}_1\):
    • Define \(\mathbf{q}_1 = \frac{\tilde{\mathbf{q}}_1}{|\tilde{\mathbf{q}}_1|}\), making \(\mathbf{q}_1\) a unit vector in the direction of \(\tilde{\mathbf{q}}_1\).
  3. Orthogonalize \(\mathbf{a}_2\):
    • Remove the component of \(\mathbf{a}_2\) in the direction of \(\mathbf{q}_1\) by defining \(\tilde{\mathbf{q}}_2 = \mathbf{a}_2 - (\mathbf{q}_1 \cdot \mathbf{a}_2) \mathbf{q}_1\).
  4. Normalize \(\tilde{\mathbf{q}}_2\):
    • Define \(\mathbf{q}_2 = \frac{\tilde{\mathbf{q}}_2}{|\tilde{\mathbf{q}}_2|}\), creating a unit vector orthogonal to \(\mathbf{q}_1\).

These steps ensure that the resulting set \({\mathbf{q}_1, \mathbf{q}_2}\) forms an orthonormal basis for the span of \({\mathbf{a}_1, \mathbf{a}_2}\).

Below, we provide R code to visualize each step in the Gram-Schmidt process for two vectors \(\mathbf{a}_1 = (0.5, 2)\) and \(\mathbf{a}_2 = (1.5, 0.5)\).

Code
# Load required libraries
library(ggplot2)
library(gridExtra)
library(grid)   # for unit()

# --- Inputs ---
a1 <- c(2, 1)
a2 <- c(1, 2)

# --- Helpers ---
norm2 <- function(v) sqrt(sum(v^2))
normalize <- function(v) v / sqrt(sum(v^2))

# draw a vector from origin with an arrow and a parsed label
geom_vec <- function(v, col="black", lbl=NULL, offset=c(0.2, 0.0), size=0.7) {
  list(
    geom_segment(aes(x=0, y=0, xend=v[1], yend=v[2]),
                 arrow = arrow(length = unit(0.2, "cm"), type="closed"),
                 color = col, linewidth = size),
    if (!is.null(lbl))
      annotate("text", x = v[1] + offset[1], y = v[2] + offset[2],
               label = lbl, parse = TRUE, color = col, fontface = 2)
  )
}

# smart limits
all_pts <- rbind(c(0,0), a1, a2)
lims <- max(abs(all_pts)) + 0.6

base_theme <- theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank())

base_canvas <- function(title) {
  ggplot() +
    geom_hline(yintercept = 0, color = "grey70") +
    geom_vline(xintercept = 0, color = "grey70") +
    coord_fixed(xlim = c(-lims, lims), ylim = c(-lims, lims)) +
    labs(title = title) +
    base_theme
}

# --- Gram–Schmidt steps ---

## Step 0
p1 <- base_canvas("") +
  geom_vec(a1, col="#7B1FA2", lbl="a[1]") +
  geom_vec(a2, col="#7B1FA2", lbl="a[2]", offset=c(0.2,-0.15)) +
  labs(title = "Step~0:~Initial~vectors")

## Step 1
q_tilde1 <- a1
p2 <- base_canvas("") +
  geom_vec(q_tilde1, col="#D32F2F", lbl="tilde(q)[1]") +
  geom_vec(a2, col="#7B1FA2", lbl="a[2]", offset=c(0.2,-0.15)) +
  labs(title = expression(Step~1*":"~~tilde(q)[1] == a[1]))

## Step 2
q1 <- normalize(q_tilde1)
p3 <- base_canvas("") +
  geom_vec(q1, col="#2E7D32", lbl="q[1]") +
  geom_vec(a2, col="#7B1FA2", lbl="a[2]", offset=c(0.2,-0.15)) +
  labs(title = expression(Step~2*":"~~q[1] == frac(tilde(q)[1], "||"*tilde(q)[1]*"||")))

## Step 3
proj_len <- sum(a2 * q1)
proj_vec <- proj_len * q1
q_tilde2 <- a2 - proj_vec
p4 <- base_canvas("") +
  geom_vec(q1, col="#2E7D32", lbl="q[1]") +
  geom_vec(a2, col="#7B1FA2", lbl="a[2]", offset=c(0.2,-0.15)) +
  geom_segment(aes(x=0, y=0, xend=proj_vec[1], yend=proj_vec[2]),
               linetype="dashed", color="#2E7D32") +
  annotate("text", x = proj_vec[1]+0.8, y = proj_vec[2]+0.2,
           label="(a[2] %.% q[1]) * q[1]", parse=TRUE, color="#2E7D32") +
  geom_vec(q_tilde2, col="#D32F2F", lbl="tilde(q)[2]") +
  labs(title = expression(Step~3*":"~~tilde(q)[2] == a[2] - (a[2] %.% q[1])~q[1]))

## Step 4
q2 <- normalize(q_tilde2)
p5 <- base_canvas("") +
  geom_vec(q1, col="#2E7D32", lbl="q[1]", offset=c(0.25, 0.1)) +
  geom_vec(q2, col="#2E7D32", lbl="q[2]") +
  labs(title = expression(Step~4*":"~~q[2] == frac(tilde(q)[2], "||"*tilde(q)[2]*"||")))


# Arrange
grid.arrange(p1, p2, p3, p4, p5, ncol = 2)

The orthogonalization process sequence matters. The sequence is:

age ->
sex ->
Cell_Type_PC1 ->
bmi ->
ancestry ->
vaccine ->
log_d28_sp_sum ->
log_d0_sp_sum ->
d28_sc_foldchange.

And I didn’t normalize the orthogonalized factors.

Implementation of sequential orthogonalization

# adjust the traits
orthogonalize <- function(df, traits) {
  
  # Initialize a list to hold the orthogonalized predictors
  orthogonalized <- df
  
  # Loop over each trait to orthogonalize
  for (i in seq_along(traits)) {
    current_trait <- traits[i]
    
    if (i == 1) next  # Skip the first trait because it has no predictors before it
    
    # Regress the current trait on all previous traits
    model <- lm(as.formula(paste(current_trait, "~", 
                                 paste(traits[1:(i-1)], 
                                       collapse = "+"))), 
                data = orthogonalized)
    
    # Replace the current trait with its residuals
    orthogonalized[[current_trait]] <- residuals(model)
  }
  
  return(orthogonalized)
}

Sequential orthogonalization relies on the sequence of traits provided. Here, I will orthogonalize the traits in the order of age, sex, Cell_Type_PC1, bmi, ancestry, vaccine, log_d28_sp_sum, log_d0_sp_sum, and d28_sc_foldchange because I want to remove the effects of the previous traits on the three immune response traits (log_d28_sp_sum, log_d0_sp_sum, and d28_sc_foldchange).

interested_traits <- c(
    "age",
    "sex",
    "Cell_Type_PC1",
    "bmi",
    "ancestry",
    "vaccine",
    "log_d28_sp_sum",
    "log_d0_sp_sum",
    "d28_sc_foldchange"
    )

# orthogonalize the predictors
orthogonalized_traits_PCA <- orthogonalize(traits[interested_traits],
                                           interested_traits) 


# use this!
ggpairs(orthogonalized_traits_PCA,
        title = "Pairwise Plots of Orthogonalized Traits")

EDA after Sequential Orthogonalization

Plot the PCA plot for each variable

# do PCA on the methylation data
MethMatrix_1_pca <- prcomp(MethMatrix %>% as.data.frame() %>% 
                             column_to_rownames(var = "sampleID"),
                           scale = FALSE, center = TRUE)
# calculate the percentage of variance explained by each principal component
round(MethMatrix_1_pca$sd^2 / sum(MethMatrix_1_pca$sd^2) * 100, 2)
  [1] 14.28  9.83  6.70  2.35  1.64  1.36  1.03  0.87  0.82  0.80  0.71  0.67
 [13]  0.63  0.59  0.58  0.56  0.53  0.53  0.51  0.51  0.50  0.49  0.48  0.47
 [25]  0.47  0.46  0.46  0.45  0.45  0.44  0.44  0.43  0.42  0.42  0.41  0.41
 [37]  0.41  0.41  0.40  0.40  0.39  0.39  0.39  0.38  0.38  0.38  0.37  0.37
 [49]  0.36  0.36  0.36  0.35  0.35  0.35  0.34  0.34  0.34  0.34  0.33  0.33
 [61]  0.33  0.33  0.33  0.33  0.32  0.32  0.32  0.32  0.31  0.31  0.31  0.31
 [73]  0.31  0.30  0.30  0.30  0.30  0.30  0.29  0.29  0.29  0.29  0.29  0.29
 [85]  0.28  0.28  0.28  0.28  0.28  0.27  0.27  0.27  0.27  0.27  0.27  0.27
 [97]  0.26  0.26  0.26  0.26  0.26  0.26  0.26  0.26  0.25  0.25  0.25  0.25
[109]  0.25  0.25  0.24  0.24  0.24  0.24  0.24  0.24  0.24  0.23  0.23  0.23
[121]  0.23  0.23  0.23  0.23  0.23  0.22  0.22  0.22  0.22  0.22  0.22  0.22
[133]  0.22  0.22  0.22  0.22  0.21  0.21  0.21  0.21  0.21  0.21  0.21  0.20
[145]  0.20  0.20  0.20  0.20  0.20  0.20  0.20  0.20  0.20  0.20  0.19  0.19
[157]  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.19  0.18  0.18  0.18  0.18
[169]  0.18  0.18  0.18  0.18  0.18  0.18  0.18  0.17  0.17  0.17  0.17  0.17
[181]  0.17  0.17  0.17  0.17  0.17  0.17  0.17  0.16  0.16  0.16  0.16  0.16
[193]  0.16  0.16  0.16  0.16  0.16  0.16  0.16  0.15  0.15  0.15  0.15  0.15
[205]  0.15  0.15  0.15  0.15  0.15  0.15  0.15  0.15  0.14  0.14  0.14  0.14
[217]  0.14  0.14  0.14  0.14  0.14  0.13  0.13  0.13  0.13  0.13  0.13  0.13
[229]  0.13  0.13  0.13  0.13  0.13  0.12  0.12  0.12  0.12  0.12  0.12  0.12
[241]  0.12  0.12  0.11  0.11  0.11  0.11  0.11  0.11  0.11  0.10  0.09  0.00
# extract the first two principal components
MethMatrix_1_pca_df <- as.data.frame(MethMatrix_1_pca$x) %>%
  dplyr::select(PC1, PC2) %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(traits, 
            by = "sampleID") %>%
  # if ancetry = 0, label as "white"; if ancetry = 1, label as "non-white"
  mutate(ancestry = ifelse(ancestry == 0, "white", "non-white")) %>%
  mutate(ancestry = as.factor(ancestry)) %>%
  mutate(sex = as.factor(sex)) %>%
  mutate(sex = ifelse(sex == 0, "male", "female")) %>%
  mutate(vaccine = as.factor(vaccine)) %>%
  # mutate(plate = as.factor(plate)) %>%
  mutate(vaccine = ifelse(vaccine == 0, "standard dose", "high dose"))


# Function to create PCA plot with specified aesthetics for Nature/Science style
plot_pca <- function(data, variable) {
  # Determine if the variable is continuous or discrete
  is_continuous <- is.numeric(data[[variable]])
  
  # Set up color scale based on variable type
  color_scale <- if (is_continuous) {
    scale_color_gradientn(colors = c("#1f78b4", "#fdbf6f", "#d7191c")) # for continuous
  } else {
    scale_color_manual(values = c("#1f78b4", "#d7191c", "#33a02c", "#ff7f00")) # customize for up to 4 discrete levels
  }
  
  # Create plot with appropriate color scale and add a title for the variable
  p <- ggplot(data, aes(x = PC1, y = PC2, color = .data[[variable]])) +
    geom_point(size = 1.8, alpha = 0.8) +
    color_scale +
    labs(
      x = paste("PC1 (", round(MethMatrix_1_pca$sdev[1]^2 / sum(MethMatrix_1_pca$sdev^2) * 100, 2), "%)", sep = ""),
      y = paste("PC2 (", round(MethMatrix_1_pca$sdev[2]^2 / sum(MethMatrix_1_pca$sdev^2) * 100, 2), "%)", sep = ""),
      title = variable,  # Add variable name as the title
      color = variable
    ) +
    theme_classic(base_size = 14) + 
    theme(
      axis.text = element_text(color = "black"),
      axis.title = element_text(),
      legend.title = element_blank(),
      legend.position = "right",
      plot.title = element_text(hjust = 0.5), # Center and bold the title
      panel.border = element_rect(color = "black", fill = NA, size = 0.7),
      panel.grid.major = element_line(color = "gray90", size = 0.3),
      panel.grid.minor = element_blank()
    )
  
  return(p)
}

# List of variables to plot
variables <- c("log_d0_sp_sum", "log_d28_sp_sum", "d28_sc_foldchange",
               "age", "bmi", "Cell_Type_PC1", 
               "sex", 
               "ancestry", "vaccine")

# Generate PCA plots for each variable with titles
plots <- lapply(variables, function(var) plot_pca(MethMatrix_1_pca_df, var))
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
# Combine plots into a grid layout with variable names as titles
combined_plot <- wrap_plots(plots, ncol = 3,
                            axis_titles = "collect")


# Display the combined plot
print(combined_plot)

# save the plot
# ggsave("PCA_plot.png", combined_plot, width = 12, height = 8, units = "in")

Multiple Linear Regression Model (LOOCV)

In this analysis, we consider the following linear model for methylation data:

\[ \mathbf{M} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}_n(0, \sigma^2 \mathbf{I}), \]

where:

  • \(\mathbf{M} \in \mathbb{R}^{n \times p}\) is the methylation matrix (response),
  • \(\mathbf{X} \in \mathbb{R}^{n \times k}\) is the matrix of orthogonalized predictors (e.g., latent factors),
  • \(\boldsymbol{\beta} \in \mathbb{R}^{k \times p}\) is the matrix of regression coefficients, and
  • \(\boldsymbol{\epsilon} \in \mathbb{R}^{n \times p}\) is the matrix of error terms.

Leave-one-out cross-validation (LOOCV) is used to select the best predictive model.

We assume the Gauss–Markov conditions hold, ensuring the least squares estimator \(\hat{\boldsymbol{\beta}}\) is the best linear unbiased estimator (BLUE):

  • Linearity: The model is linear in parameters:

    \[ \mathbf{M} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon} \]

  • Independence: The errors have zero mean and are uncorrelated:

    \[ \mathbb{E}[\boldsymbol{\epsilon}] = \mathbf{0}, \quad \mathrm{Cov}(\boldsymbol{\epsilon}) = \sigma^2 \mathbf{I} \]

  • Homoscedasticity: The error terms have constant variance:

    \[ \mathrm{Var}(\boldsymbol{\epsilon}_i) = \sigma^2, \quad \forall i \]

  • Normality (for inference): The residuals are normally distributed:

    \[ \boldsymbol{\epsilon} \sim \mathcal{N}_n(0, \sigma^2 \mathbf{I}) \]

  • No multicollinearity: The predictors are linearly independent:

    \[ \mathrm{rank}(\mathbf{X}) = k, \quad \text{with } \mathbf{X}^\top \mathbf{X} \text{ invertible} \]

We will fit a multivariate linear regression model to predict the methylation levels using the orthogonalized predictors. We will use leave-one-out cross-validation (LOOCV) to select the best model with the lowest root mean squared error (RMSE).

# Define the multivariate model function
methylation_orthogonalized_continuous_model <- function(MethMatrix, orthogonalized_traits) {
  best_rmse <- Inf  # Initialize with a large number
  
  for (i in 1:nrow(MethMatrix)) {
    
    # Create training and validation sets
    train_MethMatrix <- MethMatrix[-i,]
    train_traits <- orthogonalized_traits[-i,]
    validation_MethMatrix <- MethMatrix[i,, drop = FALSE]
    validation_traits <- orthogonalized_traits[i,, drop = FALSE]
    
    # Fit the model with continuous predictors and factors
    model <- lm(as.matrix(train_MethMatrix) ~ 
                  log_d0_sp_sum + log_d28_sp_sum + d28_sc_foldchange +
                  age + bmi + Cell_Type_PC1 + vaccine +
                  ancestry + sex, 
                data = train_traits)
    
    # Predict on the left-out observation
    predicted_on_validation <- predict(model, newdata = validation_traits)
    
    # Calculate RMSE for the prediction
    current_rmse <- sqrt(mean((as.numeric(validation_MethMatrix) - predicted_on_validation)^2))
    
    # Update the best model if the current RMSE is lower
    if (current_rmse < best_rmse) {
      best_rmse <- current_rmse
      best_model <- model
    }
  }
  
  return(list(best_model = best_model, best_rmse = best_rmse))
}

Run the function with the input as MethMatrix_1 and orthogonalized_traits_PCA (which contains 6 list of orthogonalized traits)

if (file.exists("8.13.25_methylation_orthogonalized_continuous_model.rds")) {
  methylation_orthogonalized_continuous_model <- readRDS("8.13.25_methylation_orthogonalized_continuous_model.rds")
} else {
  methylation_orthogonalized_continuous_model <- methylation_orthogonalized_continuous_model(MethMatrix_1, orthogonalized_traits_PCA)
  saveRDS(methylation_orthogonalized_continuous_model, "8.13.25_methylation_orthogonalized_continuous_model.rds")
}

methylation_orthogonalized_continuous_model[["best_rmse"]]
[1] 0.04090424

MethMatrix values are beta values (ranging from 0 to 1), then RMSE = 0.0409 means the model’s predictions are off by about 4% methylation on average per site, which is usually considered quite accurate in epigenetic modeling.

Model Evaluation

Pseudoinverse of the Coefficient Matrix

Moore-Penrose Inverse Matrix

We aim to evaluate how well the estimated coefficient matrix \(\hat{\boldsymbol{\beta}}\) captures the relationship between the latent factor matrix \(\hat{\mathbf{X}}\) and the observed methylation matrix \(\mathbf{M}\). While the model is trained via:

\[ \mathbf{M} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}_n(0, \sigma^2 \mathbf{I}), \]

our evaluation assumes the methylation matrix \(\mathbf{M}\) remains fixed (i.e., unchanged), and we aim to recover the factors \(\hat{\mathbf{X}}\) from the fitted model.

Assuming a perfect reconstruction under the fitted coefficients, we posit:

\[ \mathbf{M} \approx \hat{\mathbf{X}} \hat{\boldsymbol{\beta}}, \]

and solve for \(\hat{\mathbf{X}}\) using the Moore–Penrose inverse \(\hat{\boldsymbol{\beta}}^{+}\) of \(\hat{\boldsymbol{\beta}}\):

\[ \hat{\mathbf{X}} = \mathbf{M} \hat{\boldsymbol{\beta}}^{+}. \]

Here, \(\hat{\boldsymbol{\beta}}^{+}\) is the unique Moore–Penrose pseudoinverse satisfying the following four Penrose conditions:

\[ \begin{align*} &1.\quad \hat{\boldsymbol{\beta}} \hat{\boldsymbol{\beta}}^{+} \hat{\boldsymbol{\beta}} = \hat{\boldsymbol{\beta}}, \\ &2.\quad \hat{\boldsymbol{\beta}}^{+} \hat{\boldsymbol{\beta}} \hat{\boldsymbol{\beta}}^{+} = \hat{\boldsymbol{\beta}}^{+}, \\ &3.\quad (\hat{\boldsymbol{\beta}} \hat{\boldsymbol{\beta}}^{+})^\top = \hat{\boldsymbol{\beta}} \hat{\boldsymbol{\beta}}^{+}, \\ &4.\quad (\hat{\boldsymbol{\beta}}^{+} \hat{\boldsymbol{\beta}})^\top = \hat{\boldsymbol{\beta}}^{+} \hat{\boldsymbol{\beta}}. \end{align*} \]

This formulation allows us to estimate \(\hat{\mathbf{X}}\) from observed methylation data and visualize its agreement with the true (or originally estimated) latent factors. A close match between \(\hat{\mathbf{X}}\) and \(\mathbf{X}\) would result in predicted values aligning along the identity line in a scatter plot.

Next, We will calculate the pseudoinverse of the coefficient matrix to predict the factors for plotting.

predict_factors <- function(model, meth_matrix) {
  
  # Obtain the coefficient matrix from the multivariate linear model
  coeff_matrix <- coef(model)

  
  # Calculate the pseudoinverse of the coefficient matrix
  pseudoinv_coeff <- ginv(coeff_matrix)
  
  # Predict factors
  predicted_factors <- as.matrix(meth_matrix) %*% pseudoinv_coeff %>%
    as_tibble() %>%
    set_names(rownames(coeff_matrix))
  
  return(predicted_factors)
}
predicted_factors <- predict_factors(methylation_orthogonalized_continuous_model$best_model,
                                     MethMatrix_1)
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.

Predicted vs. the Original Factors

Calculate the predicted factors using the function predict_factors and plot the predicted vs actual factors using the function plot_predicted_vs_actual to all the 6 models.

library(ggpmisc) # for stat_poly_eq
library(yardstick)

# Packages needed: ggplot2, ggpp, ggpmisc, dplyr, stringr, yardstick
plot_predicted_vs_actual_1 <- function(predicted_factors, actual_factors) {
  # ----- Long data
  common <- intersect(names(predicted_factors), names(actual_factors))
  df_new <- do.call(rbind, lapply(common, function(v)
    data.frame(predicted = predicted_factors[[v]],
               actual    = actual_factors[[v]],
               variable  = v, stringsAsFactors = FALSE)
  ))

  # ----- Pretty names & order
  df_new$variable <- gsub("_", " ", df_new$variable) |> tools::toTitleCase()
  df_new$variable <- stringr::str_replace_all(
    df_new$variable,
    c("log D0 Sp Sum"     = "Log Day 0 Seroprotection Sum",
      "log D28 Sp Sum"    = "Log Day 28 Seroprotection Sum",
      "D28 Sc Foldchange" = "Day 28 Seroconversion Foldchange",
      "Bmi"               = "BMI")
  )
  ordered_levels <- c("Log Day 0 Seroprotection Sum",
                      "Log Day 28 Seroprotection Sum",
                      "Day 28 Seroconversion Foldchange",
                      "Age","Sex","BMI","Cell Type PC1","Ancestry","Vaccine")
  df_new$variable <- factor(df_new$variable, levels = ordered_levels)

  # ----- Metrics per facet
  df_metrics <- df_new |>
    dplyr::group_by(variable) |>
    dplyr::summarise(
      RMSE = yardstick::rmse_vec(truth = actual, estimate = predicted, na_rm = TRUE),
      MAE  = yardstick::mae_vec (truth = actual, estimate = predicted, na_rm = TRUE),
      .groups = "drop"
    )

  df_r2 <- df_new |>
    dplyr::group_by(variable) |>
    dplyr::summarise(
      adjR2 = summary(lm(predicted ~ actual))$adj.r.squared,
      .groups = "drop"
    )

  # ----- Plot
  ggplot(df_new, aes(actual, predicted)) +
    geom_point(size = 1.5, alpha = 0.7, color = "black", show.legend = FALSE) +

    # Identity & OLS lines (legend uses horizontal glyphs)
    geom_abline(
      aes(slope = 1, intercept = 0,
          color = "Identity (y = x)", linetype = "Identity (y = x)"),
      linewidth = 1.3, key_glyph = "path"
    ) +
    geom_smooth(
      aes(color = "OLS fit", linetype = "OLS fit"),
      method = "lm", se = FALSE, linewidth = 0.8, key_glyph = "path"
    ) +

    facet_wrap(~ variable, scales = "free", ncol = 3) +
    labs(x = "Actual", y = "Predicted", title = NULL) +

    # ---- Text overlays (left-aligned block) ----
    # Bold italic(R)[adj]^2
    ggpp::geom_text_npc(
      data = transform(df_r2,
                       label_math = sprintf('bold(italic(R)[adj]^2)~`=`~bold(%.2f)', adjR2)),
      aes(label = label_math, npcx = 0.03, npcy = 0.98),
      parse = TRUE, hjust = 0, vjust = 1, size = 3
    ) +
    # RMSE (looser spacing)
    ggpp::geom_text_npc(
      data = df_metrics,
      aes(label = sprintf("RMSE = %.2f", RMSE),
          npcx = 0.03, npcy = 0.89),
      hjust = 0, vjust = 1, size = 3
    ) +
    # MAE (looser spacing)
    ggpp::geom_text_npc(
      data = df_metrics,
      aes(label = sprintf("MAE = %.2f", MAE),
          npcx = 0.03, npcy = 0.82),
      hjust = 0, vjust = 1, size = 3
    ) +

    # Legend (keep labels; make OLS semi-transparent via color scale)
    scale_color_manual(
      name = NULL,
      values = c(
        "OLS fit" = "dodgerblue", 
        "Identity (y = x)" = "red"
      )
    ) +
    scale_linetype_manual(
      name = NULL,
      values = c("OLS fit" = "solid", "Identity (y = x)" = "dashed")
    ) +

    theme_classic(base_size = 12) +
    theme(
      strip.background = element_rect(fill = "gray90", color = NA),
      strip.text = element_text(face = "bold", size = 11),
      axis.title = element_text(face = "bold"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.title  = element_blank(),
      legend.text   = element_text(size = 10, face = "bold")
    ) +
    guides(
      color = guide_legend(nrow = 1),
      linetype = guide_legend(nrow = 1)
    )
}
methylation_predicted_vs_actual_plot <- plot_predicted_vs_actual_1(predicted_factors, orthogonalized_traits_PCA)
methylation_predicted_vs_actual_plot

# ggsave("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/plots/methylation_predicted_vs_actual_plot.png", methylation_predicted_vs_actual_plot, width = 9.5, height = 10.3)

EWAS

We also interested in the which CpG sites are associated with the traits. Therefore, we will perform a EWAS analysis to identify the CpG sites that are highly associated with the traits. By the multivariate linear model above, we can get the coefficients and t-test’s p-values for each predictor in each CpG site. After adjusting for multiple testing for the p-values, we can identify the CpG sites that are significantly associated with the traits.

# Function to get model summary for a specific variable
get_model_summary <- function(model_summary, variable) {
  model_summary |>
    filter(term == unique(model_summary$term[grepl(variable, model_summary$term)])) |>
    # Add the column to label the response variable
    mutate(variable = variable) |>
    # mutate(pValue_adjusted = p.adjust(p.value, method = 'BH')) |>
    dplyr::select(response, estimate, std.error, statistic, p.value, variable) |>
    rename_with(~c("CpG_site", "Coefficient", "StdError", "tValue", "pValue", "label"), everything())
}

# model and tidy table
fit <- readRDS("8.13.25_methylation_orthogonalized_continuous_model.rds")
# fit <- readRDS("1.24.25_methylation_orthogonalized_continuous_model.rds")
tidy_tbl <- tidy(fit$best_model)

# variables to extract
vars <- c("age","sex","bmi","Cell_Type","ancestry","vaccine",
          "log_d0_sp_sum","log_d28_sp_sum","d28_sc_foldchange")

# one-liner to build the combined summary
ewas_summary <- map_dfr(vars, ~ get_model_summary(tidy_tbl, .x))

Here I use the Benjamini-Hochberg method to adjust the p-values. Because we have tremendous tests where some false positives are tolerable but controlling the overall error rate is key.

# add the adjusted p-value
ewas_summary <- ewas_summary |>
  mutate(pValue_adjusted = p.adjust(pValue, method = 'BH'))

Manhattan Plot

Create a dataframe to store EWAS data

# Create a function to get EWAS data frame 
create_ewas_df <- function(results_df, annotation_df) {
  annotation_df <- annotation_df %>%
    dplyr::select(CpG, Gene) %>%
    left_join(results_df, by = c("CpG" = "CpG_site"))
  
  ewas_df <- data.frame(
    CPG = annotation_df$CpG,
    CHR = gsub(".*chr(\\w+)_.*", "\\1", annotation_df$CpG),
    BP = as.numeric(gsub(".*_([0-9]+)$", "\\1", annotation_df$CpG)),
    adjP = results_df$pValue_adjusted
  ) |>
    mutate(CPG = ifelse(CPG == "X", 23, CPG)) |>
    arrange(CPG, BP)
  
  return(ewas_df)
}
age_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "age",], cpg_annotation_df)
sex_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "sex",], cpg_annotation_df)
bmi_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "bmi",], cpg_annotation_df)
ancestry_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "ancestry",], cpg_annotation_df)
vaccine_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "vaccine",], cpg_annotation_df)
cell_type_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "Cell_Type",], cpg_annotation_df)
log_d0_sp_sum_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "log_d0_sp_sum",], cpg_annotation_df)
log_d28_sp_sum_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "log_d28_sp_sum",], cpg_annotation_df)
d28_sc_foldchange_ewas_df <- create_ewas_df(ewas_summary[ewas_summary$label == "d28_sc_foldchange",], cpg_annotation_df)

Find out those significant CpG sites under the threshold of p_vlaue < 0.05 for variables log_d0_sp_sum, log_d28_sp_sum, d28_sc_foldchange

Code
# Define a function to generate a refined Manhattan plot for publication
generate_manhattan_plot_1 <- function(data, title) {
  # Compute chromosome size
  chromosome_size <- 
    data %>%
    mutate(CHR = as.numeric(CHR)) %>%
    group_by(CHR) %>% 
    summarize(chr_len = max(BP)) %>% 
    mutate(chr_len = as.numeric(chr_len)) %>%
    mutate(tot = cumsum(chr_len) - chr_len) %>%
    dplyr::select(-chr_len) %>%
    mutate(CHR = as.character(CHR)) %>%
    
    # Add this info to the initial dataset
    left_join(data, ., by = "CHR") %>%
    
    # Add a cumulative position of each CPG
    arrange(CHR, BP) %>%
    mutate(BPcum = BP + tot) %>%
    mutate(is_annotate = ifelse(-log10(adjP) > -log10(0.05), "yes", "no"))

  
  # Define the chromosome center for x-axis labels
  axisdf <- chromosome_size %>%
    mutate(CHR = as.numeric(CHR)) %>%
    group_by(CHR) %>%
    summarize(center = (max(BPcum) + min(BPcum)) / 2 ) %>%
    filter(CHR %% 2 == 1 | CHR == 24)  # Display odd chromosomes + "24" for X

  # Format title
  title <- gsub("_", " ", title) |> tolower() |> stringr::str_to_title()
  
  # Generate the Manhattan plot
  p <- ggplot(chromosome_size, aes(x = BPcum, y = -log10(adjP))) +
    # Points for all CPGs
    geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
    scale_color_manual(values = rep(c("skyblue2", "skyblue4"), 22)) +
    
    # Custom X axis to show chromosome labels
    scale_x_continuous(
      labels = c(seq(1, 22, 2), "X"),
      breaks = axisdf$center,
      expand = c(0.01, 0)
    ) +
    
    # Y axis scale with limits
    scale_y_continuous(expand = c(0, 0), 
                       limits = c(0, max(-log10(data$adjP), na.rm = TRUE) + 1)) +
    
    # Highlight significant CPGs in bright colors
    # geom_point(data = subset(chromosome_size, is_annotate == "yes"), 
    #            color = "darkgreen", alpha = 0.8, size = 1.3) +
    
    # Add genome-wide significance lines
    geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue", size = 0.5) +
    geom_hline(yintercept = -log10(0.01), linetype = "dashed", color = "red", size = 0.5) +
    
    # Theme adjustments for publication quality
    theme_bw() +
    theme(
      legend.position = "none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y = element_line(color = "gray90"),
      panel.grid.minor.y = element_blank(),
      axis.line = element_line(color = "black"),
      axis.title.x = element_text(size = 12, face = "bold"),
      axis.title.y = element_text(size = 12, face = "bold"),
      plot.title = element_text()
    ) +
    
    # Labels
    labs(
      title = title,
      x = element_blank(),
      y = element_blank()
    )
  
  return(adjP)
}
Code
# List of data frames and corresponding titles
ewas_data <- list(
  log_d0_sp_sum_ewas_df, 
  log_d28_sp_sum_ewas_df, 
  d28_sc_foldchange_ewas_df,
  age_ewas_df, 
  sex_ewas_df, 
  bmi_ewas_df, 
  cell_type_ewas_df, 
  ancestry_ewas_df,
  vaccine_ewas_df
)
titles <- c("Log D0 Sp Sum", "Log D28 Sp Sum", "D28 SC Foldchange",
            "Age", "Sex", "BMI", "Cell Types", "Ancestry",
            "Vaccine"
            )

# Generate all plots in the specified order
plots <- Map(generate_manhattan_plot_pub, ewas_data, titles)

# Arrange the plots in the specified order
figure <- ggarrange(plotlist = plots, ncol = 3, nrow = 3)

# Annotate the figure with custom axis labels
methylation_manhattan_plot <-
annotate_figure(figure,
                bottom = text_grob("Chromosome"),
                left = text_grob(expression(-log[10]("adj P-value")), rot = 90))

# save the plot
# ggsave("methylation_manhattan_plot.png", methylation_manhattan_plot, width = 12, height = 12, units = "in")

Plot the Manhattan plots

# Faceted, publication-style Manhattan plot
# Inputs:
#   ewas_list : named or unnamed list of data.frames, each with columns CHR, BP, adjP, (optional CPG)
#   titles    : character vector of panel titles in the same order as ewas_list (if ewas_list not named)
# Options:
#   genomewide_p : horizontal dotted line (default 5e-8)
#   ncol         : number of columns in facet layout
generate_manhattan_faceted <- function(ewas_list,
                                       titles,                 # REQUIRED: order/keys (e.g., short labels)
                                       pretty_titles = NULL,   # shown in strips
                                       genomewide_p = 5e-8,
                                       ncol = 3) {
  stopifnot(is.list(ewas_list), length(ewas_list) > 0)
  stopifnot(length(titles) == length(ewas_list))

  # Bind traits and fix order
  all_df <- purrr::map2_dfr(ewas_list, titles, ~{
    df <- .x
    stopifnot(all(c("CHR","BP","adjP") %in% names(df)))
    df %>% mutate(Trait = .y)
  })
  all_df$Trait <- factor(all_df$Trait, levels = titles)

  # Standardize chromosomes & compute -log10(adjP)
  all_df <- all_df %>%
    mutate(
      CHR = case_when(
        CHR %in% c("X","x",23L)  ~ 23L,
        CHR %in% c("Y","y",24L)  ~ 24L,
        CHR %in% c("M","MT","m") ~ 25L,
        TRUE ~ suppressWarnings(as.integer(CHR))
      ),
      neglogadjP = -log10(adjP)
    ) %>%
    filter(!is.na(CHR), !is.na(BP), !is.na(adjP)) %>%
    arrange(Trait, CHR, BP)

  # Cumulative position within each trait
  chr_info <- all_df %>%
    group_by(Trait, CHR) %>%
    summarise(chr_len = max(BP, na.rm = TRUE), .groups = "drop_last") %>%
    arrange(Trait, CHR) %>%
    mutate(tot = cumsum(chr_len) - chr_len) %>%
    ungroup()

  all_df <- all_df %>%
    left_join(chr_info, by = c("Trait","CHR")) %>%
    mutate(BPcum = BP + tot)

  # Axis ticks at chromosome centers
  axis_df <- all_df %>%
    group_by(Trait, CHR) %>%
    summarise(center = (min(BPcum) + max(BPcum)) / 2, .groups = "drop") %>%
    mutate(label = case_when(
      CHR == 23 ~ "X",
      CHR == 24 ~ "Y",
      CHR == 25 ~ "MT",
      TRUE      ~ as.character(CHR)
    )) %>%
    # keep odd autosomes + X/Y/MT
    filter(CHR %% 2 == 1 | CHR %in% c(23L, 24L, 25L))

  # Thresholds legend (red dotted = genomewide_p; blue dashed = 0.05)
  gw_label <- sprintf("-log10(p = %.2g)", genomewide_p)
  thresh_df <- tibble(
    line = c(gw_label, "-log10(p = 0.05)"),
    y    = c(-log10(genomewide_p), -log10(0.05))
  )

  # Point fills (alternate neutrals)
  alt_fills <- rep(c("skyblue2", "skyblue4"), 100)

  # Optional pretty strip labels
  lab_map <- if (!is.null(pretty_titles)) {
    stopifnot(length(pretty_titles) == length(titles))
    setNames(pretty_titles, titles)
  } else NULL

  ggplot(all_df, aes(BPcum, neglogadjP)) +
    geom_point(aes(fill = as.factor(CHR)),
               shape = 21, size = 1.6, alpha = 0.85, stroke = 0,
               show.legend = FALSE) +
    scale_fill_manual(values = alt_fills, guide = "none") +

    geom_hline(data = thresh_df,
               aes(yintercept = y, color = line, linetype = line),
               linewidth = 0.6, show.legend = TRUE) +
    scale_linetype_manual(
      name = NULL,
      values = setNames(c("dashed", "dashed"), c(gw_label, "-log10(p = 0.05)"))
    ) +
    scale_color_manual(
      name = NULL,
      values = setNames(c("#C62828", "orange"), c(gw_label, "-log10(p = 0.05)")),
      guide = "legend"
    ) +

    facet_wrap(~ Trait, scales = "free", ncol = ncol,
               labeller = if (is.null(lab_map)) label_value else labeller(Trait = lab_map)) +

    scale_x_continuous(
      breaks = axis_df$center,
      labels = axis_df$label,
      expand = c(0.01, 0)
      # guide = guide_axis(n.dodge = 2)  # stagger if still tight
    ) +
    labs(x = "Chromosome", y = expression(-log10(italic(P)[adj]))) +
    theme_classic(base_size = 10) +
    theme(
      strip.background = element_rect(fill = "gray90", color = NA),
      strip.text   = element_text(face = "bold", size = 11),
      panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
      axis.title.x = element_text(face = "bold"),
      axis.title.y = element_text(face = "bold"),
      axis.text.x  = element_text(size = 8),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.text = element_text(size = 9)
    ) +
    guides(color = guide_legend(nrow = 1), linetype = guide_legend(nrow = 1))
}


titles <- c("Log Day 0 Seroprotection Sum",
            "Log Day 28 Seroprotection Sum",
            "Day 28 Seroconversion Foldchange",
            "Age","Sex","BMI","Cell Type PC1","Ancestry","Vaccine")

ewas_list <- list(
  log_d0_sp_sum_ewas_df,
  log_d28_sp_sum_ewas_df,
  d28_sc_foldchange_ewas_df,
  age_ewas_df,
  sex_ewas_df,
  bmi_ewas_df,
  cell_type_ewas_df,
  ancestry_ewas_df,
  vaccine_ewas_df
)

p <- generate_manhattan_faceted(ewas_list, titles = titles, genomewide_p = 0.01, ncol = 3)
p

# ggsave("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/plots/manhattan_facet.png", p, width=15, height=10, dpi=300)

Significant CpG sites

ewas_summary %>%
  filter(label == "log_d0_sp_sum") %>%
  left_join(cpg_annotation_df, by = c("CpG_site" = "CpG")) %>%
  filter(pValue_adjusted < 0.05) %>% .$Gene %>% unique() %>% 
  .[. != ""] -> d0_sp_gene_list


ewas_summary %>%
  filter(label == "d28_sc_foldchange") %>%
  left_join(cpg_annotation_df, by = c("CpG_site" = "CpG")) %>%
  filter(pValue_adjusted < 0.05) %>% .$Gene %>% unique() %>% 
  .[. != ""] -> d28_sc_gene_list


ewas_summary %>%
  filter(label == "log_d28_sp_sum") %>%
  left_join(cpg_annotation_df, by = c("CpG_site" = "CpG")) %>%
  filter(pValue_adjusted < 0.05) %>% .$Gene %>% unique() %>%
  .[. != ""] -> d28_sp_gene_list

Using Vann diagram to show the overlap of significant CpG corresponding or nearby genes

library(ggVennDiagram)
library(ggrepel)

# Create a list of gene sets for the Venn diagram
gene_sets <- list(
  "D28 Sc Fold Change" = d28_sc_gene_list,
  "D28 Sp Sum" = d28_sp_gene_list,
  "D0 Sp Sum" = d0_sp_gene_list
)

# figure out which one is in the overlap of three
overlap_genes <- Reduce(intersect, gene_sets)

# Plot the Venn diagram
p <- ggVennDiagram(gene_sets, set_size = 3.5, edge_size = 0.5, relative_width = 2, relative_height = 2) +
  scale_fill_gradient(low = "white", high = "skyblue3", guide = "none") +
  labs(title = "Significant Genes Detected by Methylation Model") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),  # Centered and bold title
    legend.position = "none",  # Hide the legend for a cleaner look
    panel.grid = element_blank(),  # Remove grid lines
    axis.title = element_blank(),  # Hide axis titles
    axis.text = element_blank(),  # Hide axis text
    axis.ticks = element_blank()  # Hide axis ticks
  )

p

Using gtsummary to create a summary table of significant CpG corresponding or nearby genes

library(gt)

# Prepare the data frame with gene lists as single concatenated strings
genes_df <- tibble(
  `D28 Sc Fold Change` = paste(d28_sc_gene_list, collapse = ", "),
  `D28 Sp Sum` = paste(d28_sp_gene_list, collapse = ", "),
  `D0 Sp Sum` = paste(d0_sp_gene_list, collapse = ", ")
)

# Create the table with gt
table <- genes_df %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "Groups") %>%
  gt() %>%
  # tab_header(
  #   title = "Significant Genes Detected by Methylation Model") %>%
  cols_label(
    `V1` = "Genes"
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_options(
    table.font.size = 12,
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    heading.align = "center"
  )

# Display the table
table
# Function to save filtered methylation data
save_filtered_methylation <- function(ewas_df, annotation_df, meth_mat, p_threshold = 0.05) {
  ewas_df %>%
    filter(!is.na(adjP), adjP < p_threshold) %>%             # NA-safe p-value filter
    left_join(annotation_df, by = c("CPG" = "CpG")) %>% # Join annotation
    left_join(meth_mat,      by = c("CPG" = "sampleID")) %>%
    dplyr::rename(
      CpG_site = CPG,
      pValue_adjusted = adjP
    ) 
}

# Batch runner: write one CSV per EWAS
save_filtered_methylation_batch <- function(named_ewas_list,
                                            annotation_df,
                                            meth_mat,
                                            out_dir,
                                            date_tag = "1.22.25",
                                            p_threshold = 0.05) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  iwalk(named_ewas_list, function(ewas_df, label) {
    out <- save_filtered_methylation(ewas_df, annotation_df, meth_mat, p_threshold)
    write_csv(out, file.path(out_dir, sprintf("%s_%s_sigCpGs.csv", date_tag, label)))
  })
}

# Define inputs 
out_dir  <- "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/sig_CpGs"
date_tag <- "8.17.25"

ewas_list <- list(
  log_d0_sp_sum     = log_d0_sp_sum_ewas_df,
  log_d28_sp_sum    = log_d28_sp_sum_ewas_df,
  d28_sc_foldchange = d28_sc_foldchange_ewas_df,
  age               = age_ewas_df,
  bmi               = bmi_ewas_df,
  cell_type         = cell_type_ewas_df,
  vaccine           = vaccine_ewas_df,
  ancestry          = ancestry_ewas_df,
  sex               = sex_ewas_df
)

# run the function to save significant CpG sites
save_filtered_methylation_batch(
  named_ewas_list = ewas_list,
  annotation_df   = cpg_annotation_df,
  meth_mat        = MethMatrix %>% column_to_rownames("sampleID") %>% 
    t() %>% as.data.frame() %>% rownames_to_column("sampleID"),
  out_dir         = out_dir,
  date_tag        = date_tag,
  p_threshold     = 0.05
)